Comparing numerical data across groups
Contents
Comparing numerical data across groups
Setup
import pandas as pd
import altair as alt
import warnings
# Hide FutureWarning messages so they do not clutter the cell output
warnings.simplefilter("ignore", FutureWarning)
# Turn off Altair's maximum-rows check for data passed to charts
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
Data
Import data
# Base URL of the dataset repository and the name of the CSV file to load
ROOT = "https://raw.githubusercontent.com/kirenz/datasets/master/"
DATA = "county.csv"

# Columns that are kept for the analysis
data_selection = [
    "state",
    "name",
    "pop_change",
    "population_change",
    "median_hh_income",
    "metro",
]

# Read the complete table first, then narrow it down to the selected columns
df = pd.read_csv(f"{ROOT}{DATA}")
df = df[data_selection]
Data corrections
# Clean the table in a single chain:
#   1. drop every row that contains a missing value
#   2. rename the `population_change` column to `change`
#   3. convert `change` to the pandas "category" dtype
df = (
    df.dropna()
    .rename(columns={'population_change': 'change'})
    .astype({'change': "category"})
)
Analysis
# Count how many rows fall into each `change` category
df["change"].value_counts()
no gain 1285
gain 1275
Name: change, dtype: int64
# Count how many rows fall into each `metro` value
df["metro"].value_counts()
no 1615
yes 945
Name: metro, dtype: int64
Histogram for two groups
# Histogram of median household income (at most 50 bins),
# with the bars colored by the `change` group
alt.Chart(df).mark_bar().encode(
    alt.X("median_hh_income", bin=alt.BinParams(maxbins=50)),
    alt.Y("count()"),
    alt.Color("change"),
)
Side-by-side box plot
# Box plots of median household income: income on the x-axis,
# one box (and one color) per `change` group on the y-axis
alt.Chart(df).mark_boxplot().encode(
    x="median_hh_income",
    y="change",
    color="change",
).properties(width=400, height=150)
Faceting
# Histogram of median household income split into a grid of panels
# by using the `column` and `row` encoding channels
alt.Chart(df).mark_bar().encode(
    x=alt.X("median_hh_income", bin=alt.BinParams(maxbins=50)),
    y="count()",
    column="metro",  # <-- one panel column per `metro` value
    row="change",  # <-- one panel row per `change` value
).properties(width=200, height=100)
# Build a single income histogram, then split it into panels
# with the chart's `.facet()` method
alt.Chart(df).mark_bar().encode(
    alt.X("median_hh_income", bin=alt.BinParams(maxbins=50)),
    alt.Y("count()"),
).properties(width=200, height=100).facet(  # <-- facet the finished chart
    column="metro",
    row="change",
)
Pair plots
# Grid of scatter plots: every field listed under `column` is plotted on the
# x-axis against every field listed under `row` on the y-axis, with the
# points colored by the `change` group
alt.Chart(df).mark_circle().encode(
    alt.X(alt.repeat("column"), type="quantitative"),
    alt.Y(alt.repeat("row"), type="quantitative"),
    alt.Color("change"),
).properties(width=150, height=150).repeat(
    row=["pop_change", "median_hh_income"],
    column=["median_hh_income", "pop_change"],
).interactive()